The data contains features extracted from the silhouettes of vehicles viewed from different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
Object recognition
Apply a dimensionality reduction technique – PCA – and train a model using the principal components instead of training the model on just the raw data.
# Import all the libraries used in this analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
sns.set_style(style='darkgrid') # Setting the plot style. Purely for aesthetic purposes
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the vehicle silhouette data set into the 'Data' dataframe.
Data = pd.read_csv('vehicle-1.csv')
# First five rows — sanity-check that the file parsed correctly.
Data.head()
# Last five rows.
Data.tail()
# Shape of the data set: 846 rows and 19 columns (18 features + 'class').
Data.shape
# Column dtypes and non-null counts.
Data.info()
# 'class' is a categorical target, not a plain object column.
Data['class'] = Data['class'].astype('category')
# The data set has 3 classes: car, bus and van.
Data['class'].value_counts()
# List of column names.
Data.columns
# Descriptive statistics of every numeric column (transposed for readability).
Data.describe().T
It gives the descriptive statistics (mean, median, mode, percentiles, min, max, standard deviation) of the columns of the dataset. By analysing it, we can see that:
compactness, circularity, distance_circularity, elongatedness, pr.axis_rectangularity, max.length_rectangularity, scaled_radius_of_gyration, scaled_radius_of_gyration.1, skewness_about.2, hollows_ratio are approximately normally distributed.
radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scatter_ratio, scaled_variance, scaled_variance.1, skewness_about and skewness_about.1 follow approximately right-skewed distributions.
# --- Missing-value treatment ---
# In the raw file missing entries are stored as the blank string ' ', so
# they must be converted to NaN BEFORE counting them; the original null
# check ran before this conversion and therefore under-reported.
Data = Data.replace(' ', np.nan)
# Missing values per column now that blanks are real NaNs.
Data.apply(lambda x: sum(x.isnull()))
Data.head(10)
# Impute every feature column (all but the trailing 'class' column) with
# its median.  Blank-containing columns are read as object dtype, so they
# are coerced to numeric first; the original loop used columns[:17] and
# skipped the 18th feature column entirely.
for i in Data.columns[:-1]:
    Data[i] = pd.to_numeric(Data[i], errors='coerce')
    Data[i] = Data[i].fillna(Data[i].median())
# Re-check: every feature column should now report zero missing values.
Data.apply(lambda x: sum(x.isnull()))
# Boxplots before treatment — several features show points beyond the whiskers.
Data.boxplot(figsize=(35, 15))

# Replace IQR outliers with the column median, for the skewed features only.
skewed_cols = ['radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio',
               'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration.1',
               'skewness_about', 'skewness_about.1']
for col in skewed_cols:
    first_q = Data[col].quantile(0.25)
    third_q = Data[col].quantile(0.75)
    spread = third_q - first_q
    upper = float(third_q + 1.5 * spread)
    lower = float(first_q - 1.5 * spread)
    is_outlier = (Data[col] < lower) | (Data[col] > upper)
    Data.loc[is_outlier, col] = Data[col].median()

# Boxplots after treatment — the extreme points are gone.
Data.plot(kind='box', figsize=(30, 20))
# Histograms of every numeric attribute.
Data.hist(figsize=(15, 15))

# KDE plots (continuous columns) / count plots (categorical) on one 6x4 grid.
continuous_cols = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
                   'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
                   'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
                   'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
                   'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
                   'skewness_about.2', 'hollows_ratio']
plt.figure(figsize=(30, 20))
for grid_pos, col in enumerate(Data.columns, start=1):
    plt.subplot(6, 4, grid_pos)
    if col in continuous_cols:
        # Smoothed distribution for a continuous attribute.
        sns.distplot(Data[col], kde=True, kde_kws={'bw': 0.1})
    else:
        # Bar chart for a categorical attribute.
        sns.countplot(Data[col], palette='Blues')
# 'class' is the dependent attribute with three categories: car, bus, van.
Data['class'].value_counts()
sns.countplot(Data['class'])

# Encode the categorical target as integer codes for modelling.
from sklearn.preprocessing import LabelEncoder
class_encoder = LabelEncoder()
Data['class'] = class_encoder.fit_transform(Data['class'])

# Pairwise Pearson correlations between all columns.
corr = Data.corr()
corr
# Heatmap of absolute correlations; only the strictly-lower triangle is
# drawn so that each variable pair appears exactly once.
corr = abs(Data.corr())
tri = np.tril(corr, k=-1)   # keep entries below the diagonal
hide = tri == 0             # mask the diagonal and upper triangle
plt.figure(figsize=(20, 10))
sns.set_style(style='white')  # no grid lines behind the heatmap
sns.heatmap(tri, center=0.5, cmap='Blues', annot=True,
            xticklabels=corr.index, yticklabels=corr.columns,
            cbar=False, linewidths=1, mask=hide)
plt.xticks(rotation=50)  # readability of the axis labels
plt.yticks(rotation=20)
plt.show()
From the above observations:
# Pairwise scatter plots coloured by vehicle class; KDE on the diagonal.
sns.pairplot(Data, hue = 'class', diag_kind='kde') # pairplot
plt.show()
# Quick look at the treated/encoded frame before modelling.
Data.head()
# Split the frame into the feature matrix X (first 18 columns) and target y.
X = Data.iloc[:, 0:18].values
y = Data.iloc[:, 18].values

# Standardise the features to zero mean / unit variance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Hold out 30% of the original (non-PCA) data for testing.
Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test = train_test_split(
    X, y, test_size=0.30, random_state=4)
# Since we will be fitting multiple models on the same data, the following is a user-defined helper function for that purpose.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
def fit_n_print(model, Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test):
    """Fit `model` on the training split, plot its confusion matrix on the
    test split, print the classification report, and return the test
    metrics as (accuracy, macro recall, macro precision, macro F1)."""
    from sklearn import metrics
    model.fit(Orig_X_train, Orig_y_train)                    # fit on the train data
    Orig_pred = model.predict(Orig_X_test)                   # predictions on the test set
    score = round(model.score(Orig_X_test, Orig_y_test), 4)  # test accuracy

    # Confusion matrix: y_true first, y_pred second.  (The original call had
    # the arguments swapped, which transposes the matrix in the heatmap.)
    cm = metrics.confusion_matrix(Orig_y_test, Orig_pred, labels=[0, 1, 2])
    # LabelEncoder assigns integer codes alphabetically, so 0/1/2 should be
    # bus/car/van — TODO confirm against the raw class labels in the CSV.
    class_names = ["Bus", "Car", "Van"]
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')

    print(classification_report(Orig_y_test, Orig_pred))
    recall = metrics.recall_score(Orig_y_test, Orig_pred, average='macro')
    precision = metrics.precision_score(Orig_y_test, Orig_pred, average='macro')
    # Proper macro-averaged F1 instead of the harmonic mean of the macro
    # precision and macro recall (which is not the macro-F1 definition).
    f1_score = round(metrics.f1_score(Orig_y_test, Orig_pred, average='macro'), 3)
    return score, recall, precision, f1_score  # all test-set metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB

# --- Logistic Regression on the original (scaled, non-PCA) data ---
lr = LogisticRegression(random_state=4)
lr_accuracy, lr_recall, lr_precision, lr_f1_score = fit_n_print(
    lr, Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test)
print("\n Logistic Model Score:", lr_accuracy)

from sklearn import svm
from sklearn.svm import SVC
# --- Support Vector Machine on the original data ---
# Renamed from `svm` to `svm_clf`: the old name shadowed the `sklearn.svm`
# module imported just above.  The explicit pre-fit was dropped because
# fit_n_print fits the model itself.
svm_clf = SVC(gamma=0.025, C=3)
svm_accuracy, svm_recall, svm_precision, svm_f1_score = fit_n_print(
    svm_clf, Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test)
print("\n Support Vector Machine Model Score:", svm_accuracy)

# --- Gaussian Naive Bayes on the original data ---
nb = GaussianNB()
nb_accuracy, nb_recall, nb_precision, nb_f1_score = fit_n_print(
    nb, Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test)
print("\n Naive Bayes Model Score:", nb_accuracy)
We will perform PCA in the following steps:
# --- Manual PCA: standardise, covariance matrix, eigen-decomposition ---
# Re-derive the scaled feature matrix (same frame split as before).
X = Data.iloc[:, 0:18].values
y = Data.iloc[:, 18].values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Covariance matrix of the standardised features (18 x 18).
cov_matrix = np.cov(X.T)
print("cov_matrix shape:", cov_matrix.shape)
print("Covariance_matrix", cov_matrix)

# The covariance matrix is symmetric, so use eigh rather than eig: eigh
# guarantees real eigenvalues/eigenvectors, whereas eig can return values
# with spurious tiny imaginary parts that break the sorting and plotting
# performed later.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
print("Eigen Vector \n%s", eigenvectors)
# (fixed typo in the original "Eigne Value" message)
print("Eigen Value \n%s", eigenvalues)
# Pair each eigenvalue with its eigenvector (column i of `eigenvectors`).
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]

# Sort by eigenvalue, descending.  Sorting on the first tuple element only
# is essential: the original plain .sort() falls back to comparing the
# eigenvector arrays whenever two eigenvalues tie, which raises
# "ValueError: the truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)

# Eigenvalues / eigenvectors in descending order of explained variance.
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Confirm the sorting worked.
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)

# Percentage of variance explained by each component, plus the running total.
tot = sum(eigenvalues)
var_exp = [(val / tot) * 100 for val in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Scree plot: per-component explained variance (bars) with the cumulative
# curve (steps) overlaid.
plt.figure(figsize=(8, 7))
component_ids = range(1, eigenvalues.size + 1)
plt.bar(component_ids, var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(component_ids, cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
Eight dimensions now seems very reasonable: with 8 components we can explain over 95% of the variation in the original data.
# PCA via scikit-learn: it performs all the manual steps above (centring,
# decomposition, projection) in one shot.
from sklearn.decomposition import PCA
# NOTE: ten components are kept here, even though the scree plot suggests
# ~8 already explain over 95% of the variance.
pca = PCA(n_components=10)
principalComponents = pca.fit_transform(X)
reduced_pca = pd.DataFrame(data=principalComponents)
reduced_pca.transpose()  # transposed view for notebook display only (not assigned)
pca.components_          # loadings of each principal component

# Pairplot of the reduced dimensions after PCA.
sns.pairplot(reduced_pca, diag_kind='kde')
It is clearly visible from the pairplot above that:
# Train/test split (70:30) of the PCA-transformed data.
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(
    reduced_pca, y, test_size=0.30, random_state=10)
# As before, a single helper function fits each model on this data.
def fit_n_print1(model, pca_X_train, pca_X_test, pca_y_train, pca_y_test):
    """Fit `model` on the PCA training split, plot its confusion matrix on
    the test split, print the classification report, and return the test
    metrics as (accuracy, macro recall, macro precision, macro F1)."""
    from sklearn import metrics
    model.fit(pca_X_train, pca_y_train)                    # fit on the train data
    pca_pred = model.predict(pca_X_test)                   # predictions on the test set
    score = round(model.score(pca_X_test, pca_y_test), 4)  # test accuracy

    # Confusion matrix: y_true first, y_pred second.  (The original call had
    # the arguments swapped, which transposes the matrix in the heatmap.)
    cm = metrics.confusion_matrix(pca_y_test, pca_pred, labels=[0, 1, 2])
    # LabelEncoder assigns integer codes alphabetically, so 0/1/2 should be
    # bus/car/van — TODO confirm against the raw class labels in the CSV.
    class_names = ["Bus", "Car", "Van"]
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')

    print(classification_report(pca_y_test, pca_pred))
    recall = metrics.recall_score(pca_y_test, pca_pred, average='macro')
    precision = metrics.precision_score(pca_y_test, pca_pred, average='macro')
    # Proper macro-averaged F1 instead of the harmonic mean of the macro
    # precision and macro recall (which is not the macro-F1 definition).
    f1_score = round(metrics.f1_score(pca_y_test, pca_pred, average='macro'), 3)
    return score, recall, precision, f1_score  # all test-set metrics
# --- Logistic Regression on the PCA-transformed data ---
lr1 = LogisticRegression(random_state=4)
lr1_accuracy, lr1_recall, lr1_precision, lr1_f1_score = fit_n_print1(
    lr1, pca_X_train, pca_X_test, pca_y_train, pca_y_test)
print("\n Logistic Model Score:", lr1_accuracy)

# --- Support Vector Machine on the PCA-transformed data ---
# The original comment said "original raw data", but this model is trained
# on the PCA components.  The explicit pre-fit was dropped because
# fit_n_print1 fits the model itself.
svm1 = SVC(gamma=0.025, C=3)
svm1_accuracy, svm1_recall, svm1_precision, svm1_f1_score = fit_n_print1(
    svm1, pca_X_train, pca_X_test, pca_y_train, pca_y_test)
print("\n Support Vector Machine Model Score:", svm1_accuracy)

# 10-fold cross-validation of the SVM over the full PCA data set
# (cross_val_score clones the estimator, so no prior fit is required).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm1, reduced_pca, y, cv=10)
print(scores)
print('Average score: ', np.mean(scores))

# --- Gaussian Naive Bayes on the PCA-transformed data ---
nb1 = GaussianNB()
nb1_accuracy, nb1_recall, nb1_precision, nb1_f1_score = fit_n_print1(
    nb1, pca_X_train, pca_X_test, pca_y_train, pca_y_test)
print("\n Naive Bayes Model Score:", nb1_accuracy)
import itertools
def classifiers_hypertune(name, rf, param_grid, x_train_scaled, y_train, x_test_scaled, y_test, CV):
    """Grid-search `rf` over `param_grid` with CV-fold cross-validation,
    print the best score/parameters and the classification report, plot the
    test confusion matrix, and return the test metrics as
    (accuracy, macro recall, macro precision, macro F1).

    NOTE(review): relies on GridSearchCV being imported at module level
    before the first call — the import currently sits below this
    definition; confirm call order or move the import above it.
    """
    from sklearn import metrics
    searcher = GridSearchCV(estimator=rf, param_grid=param_grid, cv=CV, verbose=1, n_jobs=-1)
    searcher.fit(x_train_scaled, y_train)
    y_pred_test = searcher.predict(x_test_scaled)  # predictions from the best estimator
    print('Best Score: ', searcher.best_score_)
    print('Best Params: ', searcher.best_params_)
    score = round(accuracy_score(y_test, y_pred_test), 4)  # test accuracy

    print(name + " Classification Report: ")
    print(classification_report(y_test, y_pred_test))

    # Confusion matrix: y_true first, y_pred second.  (The original call had
    # the arguments swapped, which transposes the matrix in the heatmap.)
    cm = metrics.confusion_matrix(y_test, y_pred_test, labels=[0, 1, 2])
    # LabelEncoder assigns integer codes alphabetically, so 0/1/2 should be
    # bus/car/van — TODO confirm against the raw class labels in the CSV.
    class_names = ["Bus", "Car", "Van"]
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')

    recall = metrics.recall_score(y_test, y_pred_test, average='macro')
    precision = metrics.precision_score(y_test, y_pred_test, average='macro')
    # Proper macro-averaged F1 instead of the harmonic mean of the macro
    # precision and macro recall (which is not the macro-F1 definition).
    f1_score = round(metrics.f1_score(y_test, y_pred_test, average='macro'), 3)
    return score, recall, precision, f1_score  # all test-set metrics
from sklearn.model_selection import GridSearchCV
svmc1 = SVC()
# Inspect the tunable hyper-parameters exposed by an SVC.
print("SVM Parameters:", svmc1.get_params())
# Parameter grid used by the grid searches below: linear and RBF kernels
# over a range of small C values.
param_grid = [
{'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear']},
{'C': [0.01, 0.05, 0.5, 1], 'kernel': ['rbf']},
]
# Alternative, wider grid (larger C and explicit gamma values).
# NOTE(review): defined but never passed to classifiers_hypertune below —
# verify whether it was meant to be used.
param_grid_1 = [
{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# Grid-searched SVM (10-fold CV) on the PCA-transformed data.
svmc1_accuracy, svmc1_recall, svmc1_precision, svmc1_f1_score = classifiers_hypertune(
    "Support Vector Machine", svmc1, param_grid,
    pca_X_train, pca_y_train, pca_X_test, pca_y_test, 10)
print("\n SVM Model Score:", svmc1_accuracy)

# Grid-searched SVM (10-fold CV) on the original scaled data.
svmc = SVC()
svmc_accuracy, svmc_recall, svmc_precision, svmc_f1_score = classifiers_hypertune(
    "Support Vector Machine", svmc, param_grid,
    Orig_X_train, Orig_y_train, Orig_X_test, Orig_y_test, 10)
print("\n SVM Model Score:", svmc_accuracy)
# Summary table of every model's test metrics, before and after PCA.
# (fixed the "Grid Serach" typo and stray apostrophes in the label strings)
result = pd.DataFrame({
    'Model': ['Before PCA Logistic Regression', 'After PCA Logistic Regression',
              'Before PCA Naive Bayes', 'After PCA Naive Bayes',
              'Before PCA SVM', 'After PCA SVM',
              'Before PCA SVM with K Fold & Grid Search',
              'After PCA SVM with K Fold & Grid Search'],
    'Accuracy': [lr_accuracy, lr1_accuracy, nb_accuracy, nb1_accuracy,
                 svm_accuracy, svm1_accuracy, svmc_accuracy, svmc1_accuracy],
    'Recall': [lr_recall, lr1_recall, nb_recall, nb1_recall,
               svm_recall, svm1_recall, svmc_recall, svmc1_recall],
    'Precision': [lr_precision, lr1_precision, nb_precision, nb1_precision,
                  svm_precision, svm1_precision, svmc_precision, svmc1_precision],
    'F1 Score': [lr_f1_score, lr1_f1_score, nb_f1_score, nb1_f1_score,
                 svm_f1_score, svm1_f1_score, svmc_f1_score, svmc1_f1_score]})
result